In [1]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re
In [2]:
urls = ["http://mail.scipy.org/pipermail/ipython-dev/",
"http://mail.scipy.org/pipermail/ipython-user/"]#,
#"http://mail.scipy.org/pipermail/scipy-dev/",
#"http://mail.scipy.org/pipermail/scipy-user/",
#"http://mail.scipy.org/pipermail/numpy-discussion/"]
archives = [Archive(url, archive_dir="../archives") for url in urls]
In [3]:
act = archives[0].get_activity()
act1 = archives[1].get_activity()
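If the activity frame has dates as rows and sender addresses as columns (as BigBang's get_activity() builds it), a quick sanity check is possible; a minimal sketch, not part of the original analysis:
In [ ]:
# Assumed shape: (number of days, number of distinct senders)
print act.shape
# Total messages per day, summed across senders
act.sum(1).plot()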
In [4]:
fig = plt.figure(figsize=(12.5, 7.5))
#act.idxmax().order().T.plot()
(act > 0).idxmax().order().plot()
fig.axes[0].yaxis_date()
In [5]:
timeorder = (act > 0).idxmax().order()
timeorder1 = (act1 > 0).idxmax().order()
In [6]:
archives[1].data[:2]
Out[6]:
In [7]:
for row in archives[0].data[:2].iterrows():
    print row[1]["Body"]
In [8]:
arx = archives[0]
In [9]:
k = pd.DataFrame()  # scratch cell; k is reassigned in the wordcount loop below
In [10]:
first_participation = {}
# assumes rows are in chronological order, so the first row seen per sender is their earliest post
for row in archives[0].data.iterrows():
    if row[1]["From"] not in first_participation:
        first_participation[row[1]["From"]] = row[1]["Date"]
In [11]:
first_participation1 = {}
for row in archives[1].data.iterrows():
    if row[1]["From"] not in first_participation1:
        first_participation1[row[1]["From"]] = row[1]["Date"]
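The same first-participation lookup can be done in one pass with pandas; a minimal sketch, assuming the Date column is directly comparable (and taking the true minimum rather than relying on row order):
In [ ]:
# Earliest Date per sender, as a dict keyed by the From field
first_participation_alt = archives[0].data.groupby("From")["Date"].min().to_dict()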
In [67]:
#First list
stop_words = set(stopwords.words('english'))  # cache as a set: one lookup per token instead of a list scan
wordcount = {}
for row in archives[0].data.iterrows():
    w = row[1]["Body"].replace("'", "")
    k = re.sub(r'[^\w]', ' ', w)
    t = nltk.tokenize.word_tokenize(k)
    for g in t:
        try:
            word = st.stem(g)
        except:
            print g
            continue  # skip tokens the stemmer rejects instead of reusing the previous stem
        if word in stop_words:
            continue
        if word not in wordcount:
            # first appearance: [count, Message-ID, Date, From, In-Reply-To]
            wordcount[word] = [1]
            wordcount[word].append(row[0])
            wordcount[word].append(row[1]["Date"])
            wordcount[word].append(row[1]["From"])
            wordcount[word].append(row[1]["In-Reply-To"])
        else:
            wordcount[word][0] += 1
wd = dict(wordcount)  # real copy, so the filtering below leaves wordcount intact
In [13]:
#Second list
wordcount1 = {}
for row in archives[1].data.iterrows():
    w = row[1]["Body"].replace("'", "")
    k = re.sub(r'[^\w]', ' ', w)
    t = nltk.tokenize.word_tokenize(k)
    for g in t:
        try:
            word = st.stem(g)
        except:
            print g
            continue
        if word in stop_words:
            continue
        if word not in wordcount1:
            wordcount1[word] = [1]
            wordcount1[word].append(row[0])
            wordcount1[word].append(row[1]["Date"])
            wordcount1[word].append(row[1]["From"])
            wordcount1[word].append(row[1]["In-Reply-To"])
        else:
            wordcount1[word][0] += 1
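The two cells above differ only in which archive they scan, so they could share one helper; a sketch of that refactor (count_words is a hypothetical name, same logic as above):
In [ ]:
def count_words(data):
    # {stem: [count, Message-ID, Date, From, In-Reply-To]} for one archive
    counts = {}
    for row in data.iterrows():
        body = re.sub(r'[^\w]', ' ', row[1]["Body"].replace("'", ""))
        for token in nltk.tokenize.word_tokenize(body):
            try:
                word = st.stem(token)
            except:
                continue
            if word in stop_words:
                continue
            if word not in counts:
                counts[word] = [1, row[0], row[1]["Date"], row[1]["From"], row[1]["In-Reply-To"]]
            else:
                counts[word][0] += 1
    return counts

#wordcount = count_words(archives[0].data); wordcount1 = count_words(archives[1].data)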
In [14]:
#new_df = pd.DataFrame(wordcount.items(),columns=["Word","Others"])
In [15]:
#pd.concat(pd.Series(wordcount.keys()),pd.DataFrame(wordcount.values(),columns=["A","B","C","D","E"]))
In [16]:
#Wordcount information dataframe, with rows as words.
new_dataframe = pd.DataFrame(wordcount).transpose()
new_dataframe.columns = ["Wordcount", "Message-ID", "Date", "From", "In-Reply-To"]
In [17]:
#Wordcount information dataframe, with rows as words.
new_dataframe1 = pd.DataFrame(wordcount1).transpose()
new_dataframe1.columns = ["Wordcount", "Message-ID", "Date", "From", "In-Reply-To"]
In [18]:
len(wordcount) #Number of unique words in mailing list1
Out[18]:
In [19]:
len(wordcount1) #Number of unique words in mailing list2
Out[19]:
In [20]:
#Number of unique words common to both mailing lists
samewordcount = 0
for word in wordcount:
    if word in wordcount1:
        samewordcount += 1
samewordcount
Out[20]:
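Because iterating a dict yields its keys, the same count is just a set intersection; an equivalent one-liner:
In [ ]:
# Shared vocabulary via set intersection (equivalent to the loop above)
len(set(wordcount) & set(wordcount1))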
In [21]:
#Number of common words first introduced by the same person in both lists
samecount = 0
for word in wordcount:
    if word in wordcount1:
        if wordcount[word][3] == wordcount1[word][3]:
            samecount += 1
samecount
Out[21]:
In [22]:
#Among words appearing 100-500 times, the number common to both mailing lists
samewordcount = 0
for word in wordcount:
    if 100 <= wordcount[word][0] <= 500:
        if word in wordcount1:
            if 100 <= wordcount1[word][0] <= 500:
                samewordcount += 1
samewordcount
Out[22]:
In [23]:
#Among words appearing 100-500 times, the number common to both mailing lists
#that were first introduced by the same person
same_person_count = 0
for word in wordcount:
    if 100 <= wordcount[word][0] <= 500:
        if word in wordcount1:
            if 100 <= wordcount1[word][0] <= 500:
                if wordcount[word][3] == wordcount1[word][3]:
                    #print word
                    same_person_count += 1
same_person_count
Out[23]:
In [24]:
#Common words (introduced by different people in the two lists)
commonwords = {}
for word in wordcount:
    if 100 <= wordcount[word][0] <= 500:
        if word in wordcount1:
            if 100 <= wordcount1[word][0] <= 500:
                if wordcount[word][3] != wordcount1[word][3]:
                    commonwords[word] = [wordcount[word][0], wordcount[word][3], wordcount[word][2],
                                         wordcount1[word][0], wordcount1[word][3], wordcount1[word][2]]
In [25]:
len(commonwords)
Out[25]:
In [27]:
#Dataframe of the words introduced by different people
df1 = pd.DataFrame(commonwords)
commonword_differentauthor_dataframe = df1.transpose()
commonword_differentauthor_dataframe.columns = ["Wordcount1", "From1", "Date1", "Wordcount2", "From2", "Date2"]
commonword_differentauthor_dataframe[:10]
Out[27]:
In [28]:
commonword_differentauthor_dataframe['Date1'][0] < commonword_differentauthor_dataframe['Date1'][1]
Out[28]:
In [29]:
len(commonwords)
Out[29]:
In [30]:
# Words with potential idea flow. Definition: word A is first introduced by person p in one list;
# person q, who was already active in that list before p introduced A (our proxy for "q saw it"),
# later introduces A in the other list. Collect every such word A.
time_influence = 0
influence_list = {}
for word in commonwords:
    if commonwords[word][2] > commonwords[word][5]:  # the word appeared in list 2 first
        if commonwords[word][1] in first_participation1:  # list-1 introducer also posts in list 2
            # ...and was active in list 2 before the word first appeared there
            if first_participation1[commonwords[word][1]] < commonwords[word][5]:
                influence_list[word] = commonwords[word]
                time_influence += 1
    else:  # the word appeared in list 1 first
        if commonwords[word][4] in first_participation:  # list-2 introducer also posts in list 1
            if first_participation[commonwords[word][4]] < commonwords[word][2]:
                influence_list[word] = commonwords[word]
                time_influence += 1
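To make the temporal condition concrete, here is a toy check with made-up dates (illustrative values only, not data from the archives):
In [ ]:
from datetime import datetime
# The word first appears in list 2, then in list 1:
date2 = datetime(2005, 1, 10)   # first appearance in list 2
date1 = datetime(2005, 3, 2)    # first appearance in list 1
# The list-1 introducer had been posting in list 2 since before the word appeared there:
q_first_in_list2 = datetime(2004, 6, 1)
print date1 > date2 and q_first_in_list2 < date2  # True -> counted as a potential idea flow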
In [31]:
time_influence
Out[31]:
In [32]:
len(influence_list)
Out[32]:
In [34]:
df2 = pd.DataFrame(influence_list)
influence_list_dataframe = df2.transpose()
influence_list_dataframe.columns = ["Wordcount1", "From1", "Date1", "Wordcount2", "From2", "Date2"]
influence_list_dataframe[:20]
Out[34]:
In [35]:
influence_words = influence_list.keys()
In [36]:
#Drop words that contain only digits (they carry little information)
reduced_influence_words = []
for word in influence_words:
    if not word.isdigit():
        reduced_influence_words.append(word)
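The same filter as a one-line comprehension:
In [ ]:
#Equivalent to the loop above
reduced_influence_words = [w for w in influence_words if not w.isdigit()]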
In [37]:
len(reduced_influence_words)
Out[37]:
In [38]:
reduced_influence_words[:20]
Out[38]:
In [39]:
#Store the list
import csv
with open('test123.csv', 'w') as fp:
    a = csv.writer(fp)
    data = [reduced_influence_words]
    a.writerows(data)
#reduced_influence_words.to_csv()
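The commented .to_csv() hint works once the plain list is wrapped in a pandas object; a sketch (the filename is arbitrary):
In [ ]:
#Same export via pandas: one word per row
pd.Series(reduced_influence_words).to_csv('influence_words.csv')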
In [40]:
influence_list_dataframe.keys()
Out[40]:
In [68]:
#Keep only the words appearing 100-500 times (value[0] holds the count)
for key, value in wd.items():
    if value[0] < 100 or value[0] > 500:
        del wd[key]
In [69]:
wc_array = np.array([value[0] for value in wd.values()])  # counts only
In [70]:
wc_array.sort()
In [72]:
len(wordcount)
Out[72]:
In [66]:
#List 1's unique words and their counts: a power-law-like distribution
%matplotlib inline
plt.plot(wc_array)
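A rank-frequency plot on log-log axes is the usual visual check for power-law behavior; a sketch using the same counts:
In [ ]:
#A roughly straight line on log-log axes suggests a power law
ranked = np.sort(wc_array)[::-1]   # counts in descending order
plt.loglog(ranked)
plt.xlabel('rank')
plt.ylabel('count')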
In [12]:
t = nltk.tokenize.word_tokenize(p)  # scratch cell: p is a sample body string defined interactively
In [11]:
len(nltk.corpus.stopwords.words('english'))
Out[11]:
In [20]:
a = []
for i in t:
    a.append(st.stem(i))
In [ ]: